Week 3 Live Coding

Load Necessary Libraries

library(ggplot2) #for graphing
library(viridis) #color palettes
library(MetBrewer) #color palettes
library(tidyverse) #for data cleaning

Load in Dataset

# Read the sample dataset. The path is relative, matching how the titanic
# data is loaded from "../Data" further down in this file; the previous
# root-absolute "/Data/..." only resolves if a Data folder sits at the
# filesystem (or drive) root.
mydata <- read.csv("../Data/SampleDataSet.csv")
# Preview the first rows to confirm the columns loaded as expected.
head(mydata)
##   ID Gender    Treatment Test1 Test2
## 1  1   Male      Control    98    32
## 2  2 Female Experimental    87    33
## 3  3 Female      Control    89    54
## 4  4 Female      Control    88    44
## 5  5   Male Experimental    76    64
## 6  6   Male      Control    68    54

Two Numeric Variables: Scatterplot

Playing with Settings (goes in the GEOM).

# Scatterplot of Test1 against Test2. Fixed settings (color, size,
# transparency) are passed to the geom itself rather than to aes().
ggplot(data = mydata, mapping = aes(x = Test1, y = Test2)) +
  geom_point(alpha = 0.5, size = 3, color = "purple") +
  # Linear-model fit drawn on top of the points, without a confidence band.
  geom_smooth(method = "lm", color = "green", se = FALSE) +
  labs(
    title = "Test Scores",
    x = "Test 1 Score",
    y = "Test 2 Score"
  ) +
  theme_minimal()

Playing with Aesthetics (goes in the AES).

Aesthetics change the graph based on the values of another variable.

# Map point size to Treatment and point color to Gender, so both variables
# change how each observation is drawn.
ggplot(
  data = mydata,
  mapping = aes(x = Test1, y = Test2, size = Treatment, color = Gender)
) +
  geom_point() +
  # Discrete viridis colors from the "plasma" palette for Gender.
  scale_color_viridis(option = "plasma", discrete = TRUE) +
  labs(
    title = "Test Scores",
    x = "Test 1 Score",
    y = "Test 2 Score"
  ) +
  theme_minimal()

Facet Wrap: one-dimensional wrapping

# Same scatterplot, drawn as one panel per Treatment level.
ggplot(data = mydata, mapping = aes(x = Test1, y = Test2)) +
  geom_point() +
  facet_wrap(. ~ Treatment) +
  labs(
    title = "Test Scores",
    x = "Test 1 Score",
    y = "Test 2 Score"
  ) +
  theme_minimal()

Facet Wrap: two-dimensional wrapping

# Scatterplot split into one panel per Gender x Treatment combination, with a
# separate linear fit drawn in each panel.
ggplot(mydata, aes(x = Test1, y = Test2, color = Gender)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Test 1 Score", y = "Test 2 Score",
       title = "Test Scores") +
  facet_wrap(Gender ~ Treatment) +
  scale_color_viridis(discrete = TRUE) +
  # Gender is already one of the faceting variables, so hide the color legend.
  guides(color = "none") +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE)

One Numeric & One Categorical Variable: Barplot

The goal: Look at the mean difference in Test2 between Treatment groups. In this case we use stat_summary() instead of a geom. This will allow us to plot a mean.

# Bar heights are the mean of Test2 within each Treatment group, computed by
# stat_summary() and drawn with the bar geom.
ggplot(data = mydata, mapping = aes(x = Treatment, y = Test2)) +
  stat_summary(geom = "bar", fun = "mean", fill = "blue") +
  labs(
    title = "Test 2 Difference by Treatment Condition",
    y = "Test 2 Score"
  ) +
  theme_minimal()

Adding on a second categorical variable

# Mean Test2 per Treatment, with side-by-side (dodged) bars for each Gender.
ggplot(
  data = mydata,
  mapping = aes(x = Treatment, y = Test2, fill = Gender)
) +
  stat_summary(geom = "bar", fun = "mean", position = "dodge") +
  # Fill colors taken from three colors of the MetBrewer "Renoir" palette.
  scale_fill_manual(values = met.brewer("Renoir", 3)) +
  labs(
    title = "Test 2 Difference by Treatment Condition",
    y = "Test 2 Score"
  ) +
  theme_minimal()

# Print the palette names that MetBrewer stores in `colorblind_palettes`.
MetBrewer::colorblind_palettes
##  [1] "Cassatt1"    "Cassatt2"    "Derain"      "Egypt"       "Greek"      
##  [6] "Hiroshige"   "Hokusai2"    "Hokusai3"    "Ingres"      "Isfahan1"   
## [11] "Isfahan2"    "Morgenstern" "OKeeffe1"    "OKeeffe2"    "Pillement"  
## [16] "Troy"        "VanGogh3"    "Veronese"
# Same dodged mean barplot, filled from the "VanGogh3" palette (one of the
# names printed by MetBrewer::colorblind_palettes).
ggplot(
  data = mydata,
  mapping = aes(x = Treatment, y = Test2, fill = Gender)
) +
  stat_summary(geom = "bar", fun = "mean", position = "dodge") +
  scale_fill_manual(values = met.brewer("VanGogh3")) +
  labs(
    title = "Test 2 Difference by Treatment Condition",
    y = "Test 2 Score"
  ) +
  theme_minimal()

One Categorical Variable: Frequency Barplot

For this, we will switch datasets to one with more categorical variables: data from the titanic.

# Load the Titanic passenger data; text columns are read in as factors.
titanic <- read.csv(
  file = "../Data/titanic.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows.
head(titanic)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

With one categorical variable, we can plot the frequency of the levels.

# Frequency of each Survived level, with the count printed above each bar.
ggplot(titanic, aes(x = as.factor(Survived), fill = Survived)) +
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  # Fixed y range; presumably leaves room for the labels above the bars.
  ylim(0, 600) +
  theme_minimal() +
  labs(x = "Survived", y = "Frequency", title = "Titanic Survival") +
  # Replace the 0/1 codes on the axis with readable labels.
  scale_x_discrete(labels = c("0" = "Did Not Survive", "1" = "Survived")) +
  guides(fill = "none")

Two Categorical Variables: Frequency Barplot

With two categorical variables, we can plot the frequency of the levels.

# Stacked frequency bars of Survived, split by Sex, with each segment's count
# printed at its vertical midpoint.
ggplot(titanic, aes(x = as.factor(Survived), fill = Sex)) +
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(stat = "count", aes(label = after_stat(count)),
            position = position_stack(vjust = 0.5), color = "white") +
  ylim(0, 600) +
  theme_minimal() +
  labs(x = "Survived", y = "Frequency",
       title = "Titanic Survival by Gender") +
  # Replace the 0/1 codes on the axis with readable labels.
  scale_x_discrete(labels = c("0" = "Did Not Survive", "1" = "Survived")) +
  # Capitalized legend labels; fill colors from the "Renoir" palette.
  scale_fill_manual(labels = c("female" = "Female", "male" = "Male"),
                    values = met.brewer("Renoir"))

Categorical Variables Alternative: Mosaic Plot

Mosaic charts can display the relationship between categorical variables using rectangles whose areas represent the proportion of cases for any given combination of levels. The color of the tiles can also indicate the degree of relationship among the variables.

#https://rkabacoff.github.io/datavis/Models.html#Mosaic


# Rename the two Sex levels to capitalized labels.
levels(titanic[["Sex"]]) <- c("Female", "Male")

# Passenger class as a factor with ordinal-style labels.
titanic[["Class"]] <- as.factor(titanic[["Pclass"]])
levels(titanic[["Class"]]) <- c("1st", "2nd", "3rd")

# Three-way contingency table of Survived x Class x Sex, printed flat.
tbl <- xtabs(~ Survived + Class + Sex, data = titanic)
ftable(tbl)
##                Sex Female Male
## Survived Class                
## 0        1st            3   77
##          2nd            6   91
##          3rd           72  300
## 1        1st           91   45
##          2nd           70   17
##          3rd           72   47
# Relabel the table's first dimension (Survived) as No/Yes.
rownames(tbl) <- c("No", "Yes")

# Print the relabelled table.
ftable(tbl)
##                Sex Female Male
## Survived Class                
## No       1st            3   77
##          2nd            6   91
##          3rd           72  300
## Yes      1st           91   45
##          2nd           70   17
##          3rd           72   47
# Draw a mosaic plot of the contingency table; vcd is loaded here, just
# before mosaic() is first called.
library(vcd)
mosaic(x = tbl, main = "Titanic data")

The size of the tile is proportional to the percentage of cases in that combination of levels. Clearly more passengers perished than survived. Those that perished were primarily 3rd class male passengers (the largest group).

If we assume that these three variables are independent, we can examine the residuals from the model and shade the tiles to match. In the graph below, dark blue represents more cases than expected given independence. Dark red represents fewer cases than expected if independence holds.

# Shaded mosaic plot: shade = TRUE colors the tiles and legend = TRUE adds a
# key for the shading. Variable names and level labels are overridden for
# display only; the table itself is unchanged.
mosaic(tbl,
       shade = TRUE,
       legend = TRUE,
       labeling_args = list(set_varnames = c(Sex = "Gender",
                                             Survived = "Survived",
                                             Class = "Passenger Class")),
       # One label per level: Class has exactly three levels (1st/2nd/3rd),
       # so the stray fourth "Crew" label has been removed.
       set_labels = list(Survived = c("No", "Yes"),
                         Class = c("1st", "2nd", "3rd"),
                         Sex = c("F", "M")),
       main = "Titanic data")

We can see that if class, gender, and survival are independent, we are seeing many more males perishing, and 1st and 2nd class females surviving than would be expected. Conversely, far fewer 1st, 2nd, and 3rd class passengers (female) died than would be expected by chance. Thus the assumption of independence is rejected.

Boxplots

A boxplot displays the 25th percentile, median, and 75th percentile of a distribution. The whiskers (vertical lines) capture roughly 99% of a normal distribution, and observations outside this range are plotted as points representing outliers.

# Boxplot of Age for each passenger Class.
ggplot(data = titanic, mapping = aes(x = Class, y = Age)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Age distribution by Class")

Notched Boxplots

Notched boxplots provide an approximate method for visualizing whether groups differ. Although not a formal test, if the notches of two boxplots do not overlap, there is strong evidence (95% confidence) that the medians of the two groups differ.

# Notched boxplots of Age by Class, filled semi-transparent blue.
ggplot(data = titanic, mapping = aes(x = Class, y = Age)) +
  geom_boxplot(
    alpha = 0.7,
    fill = "cornflowerblue",
    notch = TRUE
  ) +
  theme_minimal() +
  labs(title = "Age distribution by Class")

Violin Plot

A violin plot is a hybrid of a box plot and a kernel density plot, which shows peaks in the data. It is used to visualize the distribution of numerical data. Unlike a box plot that can only show summary statistics, violin plots depict summary statistics and the density of each variable.

# Violin plot of Age for each passenger Class.
ggplot(data = titanic, mapping = aes(x = Class, y = Age)) +
  geom_violin() +
  theme_minimal() +
  labs(title = "Age distribution by Class")

Boxplot + Violin plot MASHUP

A useful variation is to superimpose boxplots on violin plots.

#https://rkabacoff.github.io/datavis/Bivariate.html

# Violins of Age by Class with a narrow boxplot drawn on top of each one.
ggplot(data = titanic, mapping = aes(x = Class, y = Age)) +
  geom_violin(fill = "cornflowerblue") +
  # The boxplot layer is added second, so it is drawn over the violin.
  geom_boxplot(
    fill = "orange",
    width = 0.2,
    outlier.size = 2,
    outlier.color = "orange"
  ) +
  theme_minimal() +
  labs(title = "Age distribution by Class")

Ridgeline Plots

A ridgeline plot (also called a joyplot) displays the distribution of a quantitative variable for several groups. They’re similar to kernel density plots with vertical faceting, but take up less room. Ridgeline plots are created with the ggridges package.

library(ggridges)

# Ridgeline densities of Age, one ridge per Class, each filled by Class.
ggplot(data = titanic, mapping = aes(x = Age, y = Class, fill = Class)) +
  geom_density_ridges() +
  scale_fill_manual(values = met.brewer("Hokusai2")) +
  labs(title = "Age distribution by Class") +
  # The legend is removed after theme_ridges() so the override is kept.
  theme_ridges() +
  theme(legend.position = "none")

Combining jitter and barplots

# Mean Age per Class as semi-transparent bars, with the individual
# observations jittered on top in matching colors.
ggplot(data = titanic, mapping = aes(x = Class, y = Age, fill = Class)) +
  stat_summary(
    geom = "bar",
    fun = "mean",
    position = "dodge",
    alpha = 0.7
  ) +
  geom_jitter(mapping = aes(color = Class), width = 0.2, alpha = 0.3) +
  # Bars (fill) and points (color) draw from the same palette.
  scale_fill_manual(values = met.brewer("Hokusai2")) +
  scale_color_manual(values = met.brewer("Hokusai2")) +
  labs(title = "Age distribution by Class", x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "none")

Combining jitter and boxplots

# Boxplots of Age by Class with the jittered observations overlaid; the axes
# are flipped so the boxes run horizontally.
ggplot(data = titanic, mapping = aes(x = Class, y = Age, color = Class)) +
  geom_boxplot(
    size = 1,
    outlier.size = 3,
    outlier.shape = 1,
    outlier.color = "black"
  ) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  scale_color_manual(values = met.brewer("Hokusai2")) +
  coord_flip() +
  labs(title = "Age distribution by Class", x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "none")

Combining scatter and boxplots

Before moving on, it is worth mentioning the geom_boxjitter function provided in the ggpol package. It creates a hybrid boxplot - half boxplot, half scatterplot.

library(ggpol)

# Hybrid box/jitter plot of Age by Class using ggpol's geom_boxjitter().
ggplot(data = titanic, mapping = aes(x = Class, y = Age, fill = Class)) +
  geom_boxjitter(
    errorbar.draw = TRUE,
    jitter.color = "darkgrey",
    color = "black"
  ) +
  scale_fill_manual(values = met.brewer("Hokusai2")) +
  labs(title = "Age distribution by Class", x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "none")

Beeswarm Plots

Beeswarm plots (also called violin scatter plots) are similar to jittered scatterplots, in that they display the distribution of a quantitative variable by plotting points in a way that reduces overlap. In addition, they also help display the density of the data at each point (in a manner that is similar to a violin plot).

library(ggbeeswarm)

# Beeswarm-style plot of Age by Class using geom_quasirandom().
ggplot(data = titanic, mapping = aes(x = Class, y = Age, color = Class)) +
  geom_quasirandom(size = 1.5, alpha = 0.7) +
  scale_color_manual(values = met.brewer("Cassatt2")) +
  labs(title = "Age distribution by Class", x = "", y = "") +
  theme_minimal() +
  theme(legend.position = "none")

Conclusion

GGPLOT is fun! When first learning, the two most common plots you will make will be scatterplots and barplots (with stat_summary()). Start here, get comfortable, and then start playing around. More plots follow below.

Scatterplots

# Scatterplot of Fare against Age, with points colored by passenger Class.
ggplot(
  data = titanic,
  mapping = aes(x = Age, y = Fare, color = Class)
) +
  geom_point(alpha = 0.8) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  labs(title = "Age x Fare by Class") +
  theme_minimal()

Barplots (mean)

# Mean Fare per Class, with side-by-side (dodged) bars for each Sex.
ggplot(
  data = titanic,
  mapping = aes(x = Class, y = Fare, fill = Sex)
) +
  stat_summary(geom = "bar", fun = "mean", position = "dodge") +
  scale_fill_viridis(option = "mako", discrete = TRUE) +
  labs(title = "Sex x Class on Fare") +
  theme_minimal()